# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as scp
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
# Load the Superstore sample CSV into the master DataFrame and preview the first rows
MasterDF = pd.read_csv(filepath_or_buffer='SampleSuperstore.csv')
MasterDF.head(n=5)
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
MasterDF.describe()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| std | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
| min | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
# Column names, non-null counts and dtypes of the DataFrame
MasterDF.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ship Mode 9994 non-null object 1 Segment 9994 non-null object 2 Country 9994 non-null object 3 City 9994 non-null object 4 State 9994 non-null object 5 Postal Code 9994 non-null int64 6 Region 9994 non-null object 7 Category 9994 non-null object 8 Sub-Category 9994 non-null object 9 Sales 9994 non-null float64 10 Quantity 9994 non-null int64 11 Discount 9994 non-null float64 12 Profit 9994 non-null float64 dtypes: float64(3), int64(2), object(8) memory usage: 1015.1+ KB
# (number of rows, number of columns)
MasterDF.shape
(9994, 13)
# Pairwise relationships between the numeric measures.
# 'Postal Code' is an identifier, not a quantity; while it is still stored as
# an integer it would otherwise be plotted as if it were a measurement, so it
# is left out of the grid.
sns.pairplot(MasterDF.drop(columns='Postal Code'))
<seaborn.axisgrid.PairGrid at 0x250a0c19ee0>
# Postal codes are labels, not numbers: store them as text so they are not
# treated as a numeric variable in statistics and plots.
# The CSV was parsed as integers, which drops leading zeros (e.g. 1040 instead
# of 01040), so pad back to the 5-digit US ZIP format.
MasterDF['Postal Code'] = MasterDF['Postal Code'].astype('str').str.zfill(5)
MasterDF.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ship Mode 9994 non-null object 1 Segment 9994 non-null object 2 Country 9994 non-null object 3 City 9994 non-null object 4 State 9994 non-null object 5 Postal Code 9994 non-null object 6 Region 9994 non-null object 7 Category 9994 non-null object 8 Sub-Category 9994 non-null object 9 Sales 9994 non-null float64 10 Quantity 9994 non-null int64 11 Discount 9994 non-null float64 12 Profit 9994 non-null float64 dtypes: float64(3), int64(1), object(9) memory usage: 1015.1+ KB
def barp(df, varx, vary, hue):
    """Show a seaborn bar plot of ``vary`` for each ``varx`` category.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    varx : name of the categorical column placed on the x axis.
    vary : name of the numeric column placed on the y axis.
    hue : name of the column used to split each bar group by colour.

    The x categories follow the order of ``df[varx].value_counts()``.
    The figure is displayed immediately; nothing is returned.
    """
    plt.figure(figsize=[20, 10])
    category_order = df[varx].value_counts().index
    sns.barplot(data=df, x=varx, y=vary, hue=hue, order=category_order)
    plt.title(" vs ".join((varx, vary)), fontsize=15)
    # Category labels can be long (cities, states), so turn them vertical
    plt.xticks(rotation=90)
    plt.show()
def count(df, varx, hue):
    """Show a seaborn count plot of the ``varx`` categories, split by ``hue``.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    varx : name of the categorical column whose values are counted.
    hue : name of the column used to split each count by colour.

    The figure is displayed immediately; nothing is returned.
    """
    plt.figure(figsize=[30, 10])
    sns.countplot(data=df, x=varx, hue=hue)
    # Build a real string: `'Count of', varx` creates a tuple, which would be
    # rendered in the title as "('Count of', '<column>')".
    title = f"Count of {varx}"
    plt.title(title, fontsize=15)
    plt.xticks(rotation=90)
    plt.show()
def scatterp(df, varx, vary, hue, alpha = 0.5):
    """Show a seaborn scatter plot of ``vary`` against ``varx``.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    varx : name of the column placed on the x axis.
    vary : name of the column placed on the y axis.
    hue : name of the column used to colour the points.
    alpha : opacity passed to the scatter plot (default 0.5).

    The figure is displayed immediately; nothing is returned.
    """
    plt.figure(figsize=[20, 10])
    sns.scatterplot(data=df, x=varx, y=vary, hue=hue, alpha=alpha)
    plt.title(" vs ".join((varx, vary)), fontsize=15)
    plt.show()
def regp(df, varx, vary):
    """Show a seaborn regression plot of ``vary`` against ``varx``.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    varx : name of the column placed on the x axis.
    vary : name of the column placed on the y axis.

    Horizontal grid lines are enabled. The figure is displayed immediately;
    nothing is returned.
    """
    plt.figure(figsize=[20, 10])
    sns.regplot(data=df, x=varx, y=vary)
    plt.title(" vs ".join((varx, vary)), fontsize=15)
    plt.grid(axis='y')
    plt.show()
def jointp(df, varx, vary, alpha=0.5):
    """Show a seaborn joint plot of ``vary`` against ``varx``.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    varx : name of the column placed on the x axis.
    vary : name of the column placed on the y axis.
    alpha : opacity forwarded to the joint (scatter) plot (default 0.5).

    The figure is displayed immediately; nothing is returned.
    """
    # jointplot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made here (it would only leave an empty figure).
    grid = sns.jointplot(data=df, x=varx, y=vary, alpha=alpha)
    title = varx + " vs " + vary
    # Put the title on the figure rather than on one of its axes, and leave
    # room for it above the top marginal plot.
    fig = grid.ax_joint.figure
    fig.suptitle(title, fontsize=15)
    fig.subplots_adjust(top=0.93)
    # Horizontal grid lines belong on the main (joint) axes
    grid.ax_joint.grid(axis='y')
    plt.show()
# Sales by ship mode, split by customer segment
barp(MasterDF, 'Ship Mode', 'Sales', 'Segment')

# City and State charts are restricted to rows with Sales above 4000
high_value_orders = MasterDF[MasterDF['Sales'] > 4000]
for location_column in ('City', 'State'):
    barp(high_value_orders, location_column, 'Sales', 'Segment')

# Sales by product category, split by customer segment
barp(MasterDF, 'Category', 'Sales', 'Segment')

# Regression plots for each (x, y) pair of numeric columns of interest
regression_pairs = (
    ('Sales', 'Profit'),
    ('Quantity', 'Sales'),
    ('Quantity', 'Profit'),
    ('Discount', 'Sales'),
    ('Discount', 'Profit'),
)
for x_column, y_column in regression_pairs:
    regp(MasterDF, x_column, y_column)
# Pearson correlation matrix of the numeric columns.
# The text columns are excluded explicitly: DataFrame.corr raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
CorDF = MasterDF.select_dtypes(include='number').corr(method='pearson')
CorDF
| Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|
| Sales | 1.000000 | 0.200795 | -0.028190 | 0.479064 |
| Quantity | 0.200795 | 1.000000 | 0.008623 | 0.066253 |
| Discount | -0.028190 | 0.008623 | 1.000000 | -0.219487 |
| Profit | 0.479064 | 0.066253 | -0.219487 | 1.000000 |
# Annotated heatmap of the correlation matrix computed above
plt.figure(figsize=[10, 10])
heatmap_axes = sns.heatmap(CorDF, annot=True)
heatmap_axes.set_title('Correlation Among Various Variables')
plt.show()
# Label each row 'Profit' when Profit is strictly positive, otherwise 'Loss'
is_profitable = MasterDF['Profit'].gt(0)
MasterDF['Profit&Loss'] = is_profitable.map({True: 'Profit', False: 'Loss'})

# Row counts per ship mode and per profit/loss label, split by category
for counted_column in ('Ship Mode', 'Profit&Loss'):
    count(MasterDF, counted_column, 'Category')
# Number of rows per ship mode, ordered by value_counts().
# The column is passed through data=/x= keywords: handing the Series over
# positionally is deprecated in seaborn 0.11 and is interpreted as `data`
# (not `x`) by later releases.
sns.countplot(data=MasterDF, x='Ship Mode', order=MasterDF['Ship Mode'].value_counts().index)
<AxesSubplot:xlabel='Ship Mode', ylabel='count'>
# Interactive 3-D scatter: Sales on x, Discount on y, Profit on z
fig = px.scatter_3d(data_frame=MasterDF, x='Sales', y='Discount', z='Profit')
fig.show()